In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [20]:
import warnings
warnings.filterwarnings('ignore')
In [21]:
initial_data = pd.read_csv('/Users/MichaelMiao/Documents/career/Jupyter_files/Python/Yuyan/train_hot.csv',index_col=None)
In [23]:
initial_data.head()
Out[23]:
Unnamed: 0 HTWins id gameID VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 1 No 5041 120051 0 0 1 0 0 0 ... 1.85756 4.66096 0.83711 -1.44583 3.87948 1.91473 2.61613 3.91852 4.57092 4.53993
1 2 Yes 10666 120052 0 0 0 0 0 0 ... 0.70504 1.39478 0.50301 -2.27458 0.71005 1.11215 2.12085 0.55482 4.46708 4.53090
2 3 No 7852 120053 0 0 0 0 0 0 ... 0.84386 4.54963 4.09241 0.76311 1.63626 0.85108 1.77748 0.63992 4.60701 4.53979
3 4 No 3824 120054 0 0 0 0 0 0 ... 0.99371 1.43989 1.19864 -1.16884 1.63606 1.21352 1.07811 0.53820 4.53165 4.53712
4 5 Yes 5876 120055 0 0 0 0 0 0 ... 1.61258 4.17863 0.83442 -2.00100 2.93086 1.94685 3.39644 1.04180 4.53202 4.50515

5 rows × 273 columns

In [24]:
initial_data=initial_data.iloc[0:,1:]
In [25]:
initial_data['HTWins']=initial_data['HTWins'].map({'Yes':1,'No':0})
In [26]:
initial_data.head(3)
Out[26]:
HTWins id gameID VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 5041 120051 0 0 1 0 0 0 0 ... 1.85756 4.66096 0.83711 -1.44583 3.87948 1.91473 2.61613 3.91852 4.57092 4.53993
1 1 10666 120052 0 0 0 0 0 0 0 ... 0.70504 1.39478 0.50301 -2.27458 0.71005 1.11215 2.12085 0.55482 4.46708 4.53090
2 0 7852 120053 0 0 0 0 0 0 0 ... 0.84386 4.54963 4.09241 0.76311 1.63626 0.85108 1.77748 0.63992 4.60701 4.53979

3 rows × 272 columns

In [27]:
tester= pd.read_csv('/Users/MichaelMiao/Documents/career/Jupyter_files/Python/Yuyan/test_hot.csv',index_col=None)
In [28]:
tester.head(3)
tester=tester.iloc[0:,1:]
Out[28]:
Unnamed: 0 id gameID VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 1 11609 129571 0 0 0 0 0 0 0 ... 0.96095 2.01374 2.66922 1.57905 2.58901 0.93894 2.10174 0.52678 4.67913 4.65345
1 2 11625 129572 0 0 0 0 0 0 0 ... 1.72070 3.60890 0.54048 2.26119 4.78640 1.10016 5.78629 3.17932 4.71271 4.64891
2 3 13795 129573 0 1 0 0 0 0 0 ... 1.87974 1.99348 1.30515 2.33386 6.09667 3.55171 6.57626 0.84867 4.63944 4.65157

3 rows × 272 columns

In [29]:
tester.head(3)
Out[29]:
id gameID VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 11609 129571 0 0 0 0 0 0 0 0 ... 0.96095 2.01374 2.66922 1.57905 2.58901 0.93894 2.10174 0.52678 4.67913 4.65345
1 11625 129572 0 0 0 0 0 0 0 0 ... 1.72070 3.60890 0.54048 2.26119 4.78640 1.10016 5.78629 3.17932 4.71271 4.64891
2 13795 129573 0 1 0 0 0 0 0 0 ... 1.87974 1.99348 1.30515 2.33386 6.09667 3.55171 6.57626 0.84867 4.63944 4.65157

3 rows × 271 columns

In [ ]:
 
In [30]:
initial_data.drop(initial_data[['date']], axis=1, inplace=True)
initial_data.drop(initial_data[['id']], axis=1, inplace=True)
initial_data.drop(initial_data[['gameID']], axis=1, inplace=True)
initial_data.head()
tester.drop(tester[['date']], axis=1, inplace=True)
tester.drop(tester[['id']], axis=1, inplace=True)
tester.drop(tester[['gameID']], axis=1, inplace=True)
tester.head()
Out[30]:
HTWins VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS VT_HZTL ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 0 0 1 0 0 0 0 0 0 ... 1.85756 4.66096 0.83711 -1.44583 3.87948 1.91473 2.61613 3.91852 4.57092 4.53993
1 1 0 0 0 0 0 0 0 0 0 ... 0.70504 1.39478 0.50301 -2.27458 0.71005 1.11215 2.12085 0.55482 4.46708 4.53090
2 0 0 0 0 0 0 0 0 0 0 ... 0.84386 4.54963 4.09241 0.76311 1.63626 0.85108 1.77748 0.63992 4.60701 4.53979
3 0 0 0 0 0 0 0 0 0 0 ... 0.99371 1.43989 1.19864 -1.16884 1.63606 1.21352 1.07811 0.53820 4.53165 4.53712
4 1 0 0 0 0 0 0 0 0 0 ... 1.61258 4.17863 0.83442 -2.00100 2.93086 1.94685 3.39644 1.04180 4.53202 4.50515

5 rows × 269 columns

Out[30]:
VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS VT_HZTL VT_IMUS ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 0 0 0 0 0 0 0 0 0 ... 0.96095 2.01374 2.66922 1.57905 2.58901 0.93894 2.10174 0.52678 4.67913 4.65345
1 0 0 0 0 0 0 0 0 1 0 ... 1.72070 3.60890 0.54048 2.26119 4.78640 1.10016 5.78629 3.17932 4.71271 4.64891
2 0 1 0 0 0 0 0 0 0 0 ... 1.87974 1.99348 1.30515 2.33386 6.09667 3.55171 6.57626 0.84867 4.63944 4.65157
3 0 0 0 0 0 0 0 0 0 0 ... 1.55071 3.15794 0.62523 -0.03972 1.54829 1.41158 3.87761 0.09846 4.65703 4.61190
4 0 0 0 0 1 0 0 0 0 0 ... 1.51754 4.42927 0.42645 -1.00849 1.37460 0.59113 1.36368 0.99959 4.62911 4.65295

5 rows × 268 columns

In [31]:
tester.head()
initial_data.head()
Out[31]:
VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS VT_HZTL VT_IMUS ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 0 0 0 0 0 0 0 0 0 ... 0.96095 2.01374 2.66922 1.57905 2.58901 0.93894 2.10174 0.52678 4.67913 4.65345
1 0 0 0 0 0 0 0 0 1 0 ... 1.72070 3.60890 0.54048 2.26119 4.78640 1.10016 5.78629 3.17932 4.71271 4.64891
2 0 1 0 0 0 0 0 0 0 0 ... 1.87974 1.99348 1.30515 2.33386 6.09667 3.55171 6.57626 0.84867 4.63944 4.65157
3 0 0 0 0 0 0 0 0 0 0 ... 1.55071 3.15794 0.62523 -0.03972 1.54829 1.41158 3.87761 0.09846 4.65703 4.61190
4 0 0 0 0 1 0 0 0 0 0 ... 1.51754 4.42927 0.42645 -1.00849 1.37460 0.59113 1.36368 0.99959 4.62911 4.65295

5 rows × 268 columns

Out[31]:
HTWins VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS VT_HZTL ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 0 0 1 0 0 0 0 0 0 ... 1.85756 4.66096 0.83711 -1.44583 3.87948 1.91473 2.61613 3.91852 4.57092 4.53993
1 1 0 0 0 0 0 0 0 0 0 ... 0.70504 1.39478 0.50301 -2.27458 0.71005 1.11215 2.12085 0.55482 4.46708 4.53090
2 0 0 0 0 0 0 0 0 0 0 ... 0.84386 4.54963 4.09241 0.76311 1.63626 0.85108 1.77748 0.63992 4.60701 4.53979
3 0 0 0 0 0 0 0 0 0 0 ... 0.99371 1.43989 1.19864 -1.16884 1.63606 1.21352 1.07811 0.53820 4.53165 4.53712
4 1 0 0 0 0 0 0 0 0 0 ... 1.61258 4.17863 0.83442 -2.00100 2.93086 1.94685 3.39644 1.04180 4.53202 4.50515

5 rows × 269 columns

In [32]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(initial_data['HTWins'],label="Sum")

plt.show()
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1c6405f8>
In [33]:
X = initial_data.drop(['HTWins'],axis=1)
X.head()
Out[33]:
VT_AJAX VT_BASI VT_BATA VT_BATG VT_BCCS VT_BOAN VT_CALO VT_CEBS VT_HZTL VT_IMUS ... HT.OS4.to HT.OS4.fgm HT.OS4.oreb HT.OS5.plmin HT.OS5.dreb HT.OS5.to HT.OS5.fgm HT.OS5.oreb HT.pmxU HT.pmxW
0 0 0 1 0 0 0 0 0 0 0 ... 1.85756 4.66096 0.83711 -1.44583 3.87948 1.91473 2.61613 3.91852 4.57092 4.53993
1 0 0 0 0 0 0 0 0 0 0 ... 0.70504 1.39478 0.50301 -2.27458 0.71005 1.11215 2.12085 0.55482 4.46708 4.53090
2 0 0 0 0 0 0 0 0 0 0 ... 0.84386 4.54963 4.09241 0.76311 1.63626 0.85108 1.77748 0.63992 4.60701 4.53979
3 0 0 0 0 0 0 0 0 0 0 ... 0.99371 1.43989 1.19864 -1.16884 1.63606 1.21352 1.07811 0.53820 4.53165 4.53712
4 0 0 0 0 0 0 0 0 0 0 ... 1.61258 4.17863 0.83442 -2.00100 2.93086 1.94685 3.39644 1.04180 4.53202 4.50515

5 rows × 268 columns

In [34]:
y = initial_data[['HTWins']]
y.head(3)
Out[34]:
HTWins
0 0
1 1
2 0
In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.34)
In [36]:
LR = LogisticRegression()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LR, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LR_fit_time = scores['fit_time'].mean()
LR_score_time = scores['score_time'].mean()
LR_accuracy = scores['test_accuracy'].mean()
LR_precision = scores['test_precision_macro'].mean()
LR_recall = scores['test_recall_macro'].mean()
LR_f1 = scores['test_f1_weighted'].mean()
LR_roc = scores['test_roc_auc'].mean()
Out[36]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [37]:
LR_accuracy
Out[37]:
0.6840410732576133
In [45]:
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
dtree_fit_time = scores['fit_time'].mean()
dtree_score_time = scores['score_time'].mean()
dtree_accuracy = scores['test_accuracy'].mean()
dtree_precision = scores['test_precision_macro'].mean()
dtree_recall = scores['test_recall_macro'].mean()
dtree_f1 = scores['test_f1_weighted'].mean()
dtree_roc = scores['test_roc_auc'].mean()
Out[45]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [47]:
dtree_accuracy
Out[47]:
0.5892932684506164
In [48]:
SVM = SVC(probability = True)

scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SVM, X_train, y_train, scoring=scoring, cv=2)

sorted(scores.keys())
SVM_fit_time = scores['fit_time'].mean()
SVM_score_time = scores['score_time'].mean()
SVM_accuracy = scores['test_accuracy'].mean()
SVM_precision = scores['test_precision_macro'].mean()
SVM_recall = scores['test_recall_macro'].mean()
SVM_f1 = scores['test_f1_weighted'].mean()
SVM_roc = scores['test_roc_auc'].mean()
Out[48]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [23]:
SVM_accuracy
Out[23]:
0.5942376585301881
In [38]:
LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()
Out[38]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [39]:
LDA_accuracy
Out[39]:
0.6827687076675701
In [51]:
QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()
Out[51]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [52]:
QDA_accuracy
Out[52]:
0.5784729195684395
In [53]:
random_forest = RandomForestClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
forest_fit_time = scores['fit_time'].mean()
forest_score_time = scores['score_time'].mean()
forest_accuracy = scores['test_accuracy'].mean()
forest_precision = scores['test_precision_macro'].mean()
forest_recall = scores['test_recall_macro'].mean()
forest_f1 = scores['test_f1_weighted'].mean()
forest_roc = scores['test_roc_auc'].mean()
Out[53]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [54]:
forest_accuracy
Out[54]:
0.6244054683425941
In [30]:
KNN = KNeighborsClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(KNN, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
KNN_fit_time = scores['fit_time'].mean()
KNN_score_time = scores['score_time'].mean()
KNN_accuracy = scores['test_accuracy'].mean()
KNN_precision = scores['test_precision_macro'].mean()
KNN_recall = scores['test_recall_macro'].mean()
KNN_f1 = scores['test_f1_weighted'].mean()
KNN_roc = scores['test_roc_auc'].mean()
Out[30]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [31]:
KNN_accuracy 
Out[31]:
0.6169023723962039
In [32]:
bayes = GaussianNB()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(bayes, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
bayes_fit_time = scores['fit_time'].mean()
bayes_score_time = scores['score_time'].mean()
bayes_accuracy = scores['test_accuracy'].mean()
bayes_precision = scores['test_precision_macro'].mean()
bayes_recall = scores['test_recall_macro'].mean()
bayes_f1 = scores['test_f1_weighted'].mean()
bayes_roc = scores['test_roc_auc'].mean()
Out[32]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [33]:
bayes_accuracy
Out[33]:
0.651721020674379
In [38]:
models_initial = pd.DataFrame({
    'Model'       : ['Logistic Regression', 'Decision Tree', 'Support Vector Machine', 'Linear Discriminant Analysis', 'Quadratic Discriminant Analysis', 'Random Forest', 'K-Nearest Neighbors', 'Bayes'],
    'Fitting time': [LR_fit_time, dtree_fit_time, SVM_fit_time, LDA_fit_time, QDA_fit_time, forest_fit_time, KNN_fit_time, bayes_fit_time],
    'Scoring time': [LR_score_time, dtree_score_time, SVM_score_time, LDA_score_time, QDA_score_time, forest_score_time, KNN_score_time, bayes_score_time],
    'Accuracy'    : [LR_accuracy, dtree_accuracy, SVM_accuracy, LDA_accuracy, QDA_accuracy, forest_accuracy, KNN_accuracy, bayes_accuracy],
    'Precision'   : [LR_precision, dtree_precision, SVM_precision, LDA_precision, QDA_precision, forest_precision, KNN_precision, bayes_precision],
    'Recall'      : [LR_recall, dtree_recall, SVM_recall, LDA_recall, QDA_recall, forest_recall, KNN_recall, bayes_recall],
    'F1_score'    : [LR_f1, dtree_f1, SVM_f1, LDA_f1, QDA_f1, forest_f1, KNN_f1, bayes_f1],
    'AUC_ROC'     : [LR_roc, dtree_roc, SVM_roc, LDA_roc, QDA_roc, forest_roc, KNN_roc, bayes_roc],
    }, columns = ['Model', 'Fitting time', 'Scoring time', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC'])

models_initial.sort_values(by='Accuracy', ascending=False)
Out[38]:
Model Fitting time Scoring time Accuracy Precision Recall F1_score AUC_ROC
0 Logistic Regression 1.945001 0.010142 0.679771 0.667080 0.655947 0.674140 0.731036
3 Linear Discriminant Analysis 0.369461 0.012189 0.677518 0.664452 0.653291 0.671691 0.730693
7 Bayes 0.045351 0.017876 0.651721 0.645880 0.650037 0.654121 0.716535
5 Random Forest 0.635183 0.022152 0.624542 0.611390 0.611524 0.624505 0.660983
6 K-Nearest Neighbors 0.060221 0.337358 0.616902 0.597895 0.592085 0.611195 0.624261
2 Support Vector Machine 19.037507 14.862141 0.594238 0.547088 0.500185 0.443153 0.603110
1 Decision Tree 1.694562 0.012115 0.583438 0.569718 0.569960 0.583961 0.569960
4 Quadratic Discriminant Analysis 0.215380 0.034924 0.571882 0.606394 0.584053 0.540395 0.597593
In [45]:
correlation = initial_data.corr()

mask = np.zeros_like(correlation, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(100, 100))

cmap = sns.diverging_palette(180, 20, as_cmap=True)
sns.heatmap(correlation, mask=mask, cmap=cmap, vmax=1, vmin =-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)

plt.show()
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a6e2cae48>
In [46]:
cmap
plt.show()
Out[46]:
<matplotlib.colors.LinearSegmentedColormap at 0x1a2d6365c0>
In [47]:
models = [LogisticRegression(),
         DecisionTreeClassifier(),
         SVC(probability = True),
         LinearDiscriminantAnalysis(),
         QuadraticDiscriminantAnalysis(),
         RandomForestClassifier(),
         KNeighborsClassifier(),
         GaussianNB()]

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
In [49]:
for model in models:
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=2)
In [50]:
print(model, scores['fit_time'].mean(), scores['score_time'].mean(), scores['test_accuracy'].mean(),
scores['test_precision_macro'].mean(), scores['test_recall_macro'].mean(), 
scores['test_f1_weighted'].mean(), scores['test_roc_auc'].mean())
GaussianNB(priors=None, var_smoothing=1e-09) 0.023267149925231934 0.08273494243621826 0.6482593037214885 0.6431793675014192 0.6480053817692566 0.6511607402997961 0.7032265388496468
In [53]:
models_ens = list(zip(['LR', 'DT', 'SVM', 'LDA', 'QDA', 'RF', 'KNN', 'NB'], models))

model_ens = VotingClassifier(estimators = models_ens, voting = 'hard')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
#prob = model_ens.predict_proba(X_test)[:,1]

acc_hard = accuracy_score(y_test, pred)
prec_hard = precision_score(y_test, pred)
recall_hard = recall_score(y_test, pred)
f1_hard = f1_score(y_test, pred)
roc_auc_hard = 'not applicable'
Out[53]:
VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('DT',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                                     n_estimators='warn',
                                                     n_jobs=None,
                                                     oob_score=False,
                                                     random_state=None,
                                                     verbose=0,
                                                     warm_start=False)),
                             ('KNN',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('NB',
                              GaussianNB(priors=None, var_smoothing=1e-09))],
                 flatten_transform=True, n_jobs=None, voting='hard',
                 weights=None)
In [54]:
model_ens = VotingClassifier(estimators = models_ens, voting = 'soft')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
prob = model_ens.predict_proba(X_test)[:,1]

acc_soft = accuracy_score(y_test, pred)
prec_soft = precision_score(y_test, pred)
recall_soft = recall_score(y_test, pred)
f1_soft = f1_score(y_test, pred)
roc_auc_soft = roc_auc_score(y_test, prob)
Out[54]:
VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('DT',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                                     n_estimators='warn',
                                                     n_jobs=None,
                                                     oob_score=False,
                                                     random_state=None,
                                                     verbose=0,
                                                     warm_start=False)),
                             ('KNN',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('NB',
                              GaussianNB(priors=None, var_smoothing=1e-09))],
                 flatten_transform=True, n_jobs=None, voting='soft',
                 weights=None)
In [55]:
models_ensembling = pd.DataFrame({
    'Model'       : ['Ensebling_hard', 'Ensembling_soft'],
    'Accuracy'    : [acc_hard, acc_soft],
    'Precision'   : [prec_hard, prec_soft],
    'Recall'      : [recall_hard, recall_soft],
    'F1_score'    : [f1_hard, f1_soft],
    'AUC_ROC'     : [roc_auc_hard, roc_auc_soft],
    }, columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC'])

models_ensembling.sort_values(by='Accuracy', ascending=False)
Out[55]:
Model Accuracy Precision Recall F1_score AUC_ROC
0 Ensebling_hard 0.685224 0.722034 0.758457 0.739797 not applicable
1 Ensembling_soft 0.676821 0.723329 0.732344 0.727809 0.722988
In [1]:
X.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-bc5f1a0adac8> in <module>
----> 1 X.shape

NameError: name 'X' is not defined
In [28]:
 
Out[28]:
(9520, 199)
In [50]:
tester.shape
X.shape
new=pd.concat([X,tester])
new.shape
Out[50]:
(1648, 268)
Out[50]:
(9520, 268)
Out[50]:
(11168, 268)
In [57]:
y=pd.concat([y,y[0:1648]])
In [57]:
y.shape
Out[57]:
(9520, 1)
In [58]:
X.shape
Out[58]:
(9520, 268)
In [59]:
lsvc = LinearSVC(C=0.05, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_svc = model.transform(X)
X_svc.shape #reduction from 30 to 10 featuresb
Out[59]:
(9520, 192)
In [74]:
lsvc = LinearSVC(C=0.05, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
In [90]:
X_svc = model.transform(X)
X_svc.shape #red
Out[90]:
(9520, 193)
In [91]:
test_svc = model.transform(tester)
test_svc.shape
Out[91]:
(1648, 193)
In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_svc,y,test_size=0.30)
Error in parse(text = x, srcfile = src): <text>:1:8: unexpected ','
1: X_train,
           ^
Traceback:
In [73]:
pd.DataFrame(X_svc).to_csv('new.csv')
In [128]:
np.random.seed(123)
LR = LogisticRegression()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LR, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LR_fit_time = scores['fit_time'].mean()
LR_score_time = scores['score_time'].mean()
LR_accuracy = scores['test_accuracy'].mean()
LR_precision = scores['test_precision_macro'].mean()
LR_recall = scores['test_recall_macro'].mean()
LR_f1 = scores['test_f1_weighted'].mean()
LR_roc = scores['test_roc_auc'].mean()
Out[128]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [129]:
LR_accuracy
Out[129]:
0.6905851448520502
In [106]:
LR.fit(X_train,y_train)
Out[106]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [109]:
y_pred = LR.predict(test_svc)
In [2]:
y_pred.shape
Error in eval(expr, envir, enclos): object 'y_pred.shape' not found
Traceback:
In [1]:
tester['HTWins']=y_pred
tester['HTWins']=tester['HTWins'].map({1:"Yes",0:"No"})
idd= pd.read_csv('/Users/MichaelMiao/Documents/career/Jupyter_files/Python/Yuyan/test_hot.csv',index_col=None)
submit=idd[['id']]
submit['HTWins']=tester['HTWins']
submit.head(3)
Error in parse(text = x, srcfile = src): <text>:2:34: unexpected symbol
1: tester['HTWins']=y_pred
2: tester['HTWins']=tester['HTWins'].map
                                    ^
Traceback:
In [131]:
submit.to_csv('submitxx.csv',index=False)
In [116]:
 
In [120]:
 
In [121]:
 
In [122]:
 
Out[122]:
id HTWins
0 11609 Yes
1 11625 No
2 13795 No
In [123]:
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
dtree_fit_time = scores['fit_time'].mean()
dtree_score_time = scores['score_time'].mean()
dtree_accuracy = scores['test_accuracy'].mean()
dtree_precision = scores['test_precision_macro'].mean()
dtree_recall = scores['test_recall_macro'].mean()
dtree_f1 = scores['test_f1_weighted'].mean()
dtree_roc = scores['test_roc_auc'].mean()
Out[123]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [125]:
dtree_accuracy
Out[125]:
0.5924155305610709
In [65]:
SVM = SVC(probability = True)

scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SVM, X_train, y_train, scoring=scoring, cv=2)

sorted(scores.keys())
SVM_fit_time = scores['fit_time'].mean()
SVM_score_time = scores['score_time'].mean()
SVM_accuracy = scores['test_accuracy'].mean()
SVM_precision = scores['test_precision_macro'].mean()
SVM_recall = scores['test_recall_macro'].mean()
SVM_f1 = scores['test_f1_weighted'].mean()
SVM_roc = scores['test_roc_auc'].mean()
Out[65]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [66]:
SVM_accuracy
Out[66]:
0.592551369325147
In [126]:
LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()
Out[126]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [127]:
LDA_accuracy
Out[127]:
0.6873973693450302
In [71]:
QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()
Out[71]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [72]:
QDA_accuracy
Out[72]:
0.4661757889429284
In [73]:
random_forest = RandomForestClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
forest_fit_time = scores['fit_time'].mean()
forest_score_time = scores['score_time'].mean()
forest_accuracy = scores['test_accuracy'].mean()
forest_precision = scores['test_precision_macro'].mean()
forest_recall = scores['test_recall_macro'].mean()
forest_f1 = scores['test_f1_weighted'].mean()
forest_roc = scores['test_roc_auc'].mean()
Out[73]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [75]:
forest_accuracy
Out[75]:
0.6294789337969167
In [76]:
KNN = KNeighborsClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(KNN, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
KNN_fit_time = scores['fit_time'].mean()
KNN_score_time = scores['score_time'].mean()
KNN_accuracy = scores['test_accuracy'].mean()
KNN_precision = scores['test_precision_macro'].mean()
KNN_recall = scores['test_recall_macro'].mean()
KNN_f1 = scores['test_f1_weighted'].mean()
KNN_roc = scores['test_roc_auc'].mean()
Out[76]:
['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_macro',
 'test_recall_macro',
 'test_roc_auc']
In [78]:
KNN_accuracy 
Out[78]:
0.6081519440495652
In [ ]: